# general visualisation
library('ggplot2') # visualisation
library('scales') # visualisation
library('patchwork') # visualisation
library('RColorBrewer') # visualisation
library('corrplot') # visualisation
# general data manipulation
library('dplyr') # data manipulation
library('readr') # input/output
library('vroom') # input/output
library('skimr') # overview
library('tibble') # data wrangling
library('tidyr') # data wrangling
library('purrr') # data wrangling
library('stringr') # string manipulation
library('forcats') # factor manipulation
# specific visualisation
library('alluvial') # visualisation
library('ggrepel') # visualisation
library('ggforce') # visualisation
library('ggridges') # visualisation
library('gganimate') # animations
library('GGally') # visualisation
library('ggthemes') # visualisation
library('wesanderson') # visualisation
library('kableExtra') # display
# Date + forecast
library('lubridate') # date and time
library('forecast') # time series analysis
library('prophet') # time series analysis
library('timetk') # time series analysis
# Interactivity
library('crosstalk')
library('plotly')
# parallel
library('foreach')
library('doParallel')
#' Exact binomial confidence interval as a named list
#'
#' @param x Number of successes, passed straight to `binom.test()`.
#' @param n Number of trials, passed straight to `binom.test()`.
#' @return A list with elements `lwr` and `upr` holding the lower and upper
#'   bounds of the confidence interval reported by `binom.test()`.
get_binCI <- function(x, n) {
  bounds <- binom.test(x, n)$conf.int
  names(bounds) <- c("lwr", "upr")
  as.list(bounds)
}
library(mgcv)
# Load the four comma-separated input tables from the working directory.
# `col_types = cols()` lets vroom guess every column type without printing
# the column specification.
clean_pages <- vroom(
  "clean_pages.csv",
  delim = ",",
  col_types = cols()
)
session_info <- vroom(
  "sess.csv",
  delim = ",",
  col_types = cols()
)
signals <- vroom(
  "signals.csv",
  delim = ",",
  col_types = cols()
)
clean_signals <- vroom(
  "csignals.csv",
  delim = ",",
  col_types = cols()
)
# ---- Interactive time-of-day histogram of page events per user ----
set.seed(4321)

# Only the user id and the event timestamp are needed from here on.
clean_pages <- clean_pages %>%
  select(userId, time)

# One colour per distinct user, recycling the 7-colour "Set2" palette.
cols <- clean_pages %>%
  distinct(userId) %>%
  mutate(cols = rep_len(brewer.pal(7, "Set2"), length.out = n()))

# Fold every timestamp onto a single day so the x axis shows time of day.
# NOTE(review): assumes `time` is (convertible to) epoch seconds - confirm.
ts_out <- clean_pages %>%
  left_join(cols, by = "userId") %>%
  mutate(
    time = as.POSIXct(as.numeric(time) %% 86400, origin = "1970-01-01", tz = "GMT")
  )

# Named colour vector (names = user ids) consumed by the manual fill scale.
pal <- cols$cols %>%
  setNames(cols$userId)

shared_ts <- highlight_key(ts_out)

# "Set3" offers at most 12 colours; requesting 100 only raised a warning and
# returned these same 12.
palette(brewer.pal(12, "Set3"))

# The bars are coloured through the `fill` aesthetic, so the manual palette
# must be attached to the fill scale (a colour scale is never consulted here).
# factor() keeps the fill discrete even if the ids were read as numbers.
gg <- shared_ts %>%
  ggplot(aes(time, fill = factor(userId), group = userId)) +
  geom_histogram(bins = 60) +
  scale_fill_manual(values = pal) +
  labs(x = "Time", y = "Count", fill = "userId") +
  theme_tufte()

# A single widget takes a single width (12 = full row).
filter2 <- bscols(
  ggplotly(gg, dynamicTicks = TRUE),
  widths = 12
)
bscols(filter2)
# Add `count`: the running index (1, 2, ...) of each row within its
# (userId, word) group, in current row order. The result stays grouped.
signal_with_id <- mutate(
  group_by(clean_signals, userId, word),
  count = row_number()
)
# Horizontal bar chart of the share of rows per `pos` value in `data`,
# bars ordered by count, y axis formatted as percentages, legend hidden.
plot_pos_share <- function(data, title) {
  shares <- data %>%
    count(pos) %>%
    mutate(total = sum(n), perc = n / total)

  ggplot(shares, aes(reorder(pos, n, FUN = min), perc, fill = pos)) +
    geom_col() +
    scale_y_continuous(labels = scales::percent) +
    coord_flip() +
    theme_hc() +
    theme(legend.position = "none") +
    labs(x = "", y = "", title = title)
}

# Left panel: all rows. Right panel: only rows with signal == 0.
p <- plot_pos_share(signals, "Original")
p2 <- plot_pos_share(filter(signals, signal == 0), "Unknown")

# Two equally wide panels side by side.
layout <- "
AAABBB
"
p + p2 + plot_layout(design = layout)

# Per (userId, word): gap between consecutive rows and the matching
# "previous -> current" signal transition label.
# NOTE(review): lag() follows the current row order; presumably rows are
# already sorted by time within each group - confirm.
diff_df <- signal_with_id %>%
  group_by(userId, word) %>%
  mutate(
    diff = time - lag(time),
    tran = paste(lag(signal), "->", signal)
  ) %>%
  ungroup() %>%
  # 60 * 60 * 24 seconds per day; presumably `time` is in seconds - confirm.
  mutate(diff = diff / (60 * 60 * 24)) %>%
  # Keep only known, non-zero gaps shorter than 15 days.
  filter(!is.na(diff), diff != 0, diff < 15) %>%
  select(word, diff, signal, tran)
# Empirical CDF of the wait time, one coloured curve per transition type.
# The curves use the colour aesthetic, so the legend title is blanked on the
# colour scale (a fill scale has no effect on this plot).
p <- diff_df %>%
  ggplot(aes(x = diff, color = tran)) +
  stat_ecdf(geom = "point", size = 0.5) +
  theme_hc() +
  labs(x = "Days", y = "", title = "Wait Time CDF by Transition Type") +
  scale_color_discrete("")

# Share of each transition type among the retained rows.
p2 <- diff_df %>%
  count(tran) %>%
  add_tally(n, name = "total") %>%
  mutate(perc = n / total) %>%
  ggplot(aes(reorder(tran, n, FUN = min), perc, fill = tran)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme_hc() +
  theme(legend.position = "none") +
  labs(x = "", y = "", title = "Transition Type")

# Overall empirical CDF drawn as a filled ribbon from 0 up to the ECDF value.
# The constant fill mapping only provides a single default colour; the legend
# is hidden through the theme (`show.legend` is not an aesthetic).
p3 <- diff_df %>%
  ggplot(aes(x = diff, fill = "cdf")) +
  stat_ecdf(aes(ymin = 0, ymax = after_stat(y)), geom = "ribbon") +
  theme_hc() +
  theme(legend.position = "none") +
  labs(x = "Days", y = "", title = "Wait Time CDF")

# Patchwork areas are filled in order: A = p3, B = p, C = p2.
layout <- "
AAACC
BBBBB
BBBBB
BBBBB
"
p3 + p + p2 + plot_layout(design = layout)
